# Load the bank direct-marketing dataset and take a first look at it.
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE(review): path is relative to the notebook's location — confirm the file exists there
bank_df = pd.read_csv("../../data/bank_small.csv")
# Display the raw data (4521 rows x 17 columns, per the output below)
bank_df
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | unemployed | married | primary | no | 1787 | no | no | cellular | 19 | oct | 79 | 1 | -1 | 0 | unknown | no |
| 1 | 33 | services | married | secondary | no | 4789 | yes | yes | cellular | 11 | may | 220 | 1 | 339 | 4 | failure | no |
| 2 | 35 | management | single | tertiary | no | 1350 | yes | no | cellular | 16 | apr | 185 | 1 | 330 | 1 | failure | no |
| 3 | 30 | management | married | tertiary | no | 1476 | yes | yes | unknown | 3 | jun | 199 | 4 | -1 | 0 | unknown | no |
| 4 | 59 | blue-collar | married | secondary | no | 0 | yes | no | unknown | 5 | may | 226 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4516 | 33 | services | married | secondary | no | -333 | yes | no | cellular | 30 | jul | 329 | 5 | -1 | 0 | unknown | no |
| 4517 | 57 | self-employed | married | tertiary | yes | -3313 | yes | yes | unknown | 9 | may | 153 | 1 | -1 | 0 | unknown | no |
| 4518 | 57 | technician | married | secondary | no | 295 | no | no | cellular | 19 | aug | 151 | 11 | -1 | 0 | unknown | no |
| 4519 | 28 | blue-collar | married | secondary | no | 1137 | no | no | cellular | 6 | feb | 129 | 4 | 211 | 3 | other | no |
| 4520 | 44 | entrepreneur | single | tertiary | no | 1136 | yes | yes | cellular | 3 | apr | 345 | 2 | 249 | 7 | other | no |
4521 rows × 17 columns
# Citation Request:
# This dataset is publicly available for research. The details are described
# in [Moro et al., 2011]. Please include this citation if you plan to use this database:
#
# [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank
# Direct Marketing: An Application of the CRISP-DM Methodology.
# In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling
# Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.
#
# Available at: [pdf] http://hdl.handle.net/1822/14838
# [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt
#
# 1. Title: Bank Marketing
#
# 2. Sources
# Created by: Paulo Cortez (Univ. Minho) and Sérgio Moro (ISCTE-IUL) @ 2012
#
# 3. Past Usage:
#
# The full dataset was described and analyzed in:
#
# S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology.
# In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães,
# Portugal, October, 2011. EUROSIS.
#
# 4. Relevant Information:
#
# The data is related with direct marketing campaigns of a Portuguese banking institution.
# The marketing campaigns were based on phone calls. Often, more than one contact to the
# same client was required, in order to assess whether the product (bank term deposit)
# would be (or not) subscribed.
#
# There are two datasets:
# 1) bank-full.csv with all examples, ordered by date (from May 2008 to November 2010).
# 2) bank.csv with 10% of the examples (4521), randomly selected from bank-full.csv.
# The smaller dataset is provided to test more computationally demanding machine
# learning algorithms (e.g. SVM).
#
# The classification goal is to predict if the client will subscribe a term deposit (variable y).
#
# 5. Number of Instances: 45211 for bank-full.csv (4521 for bank.csv)
#
# 6. Number of Attributes: 16 + output attribute (y).
#
# 7. Attribute information:
#
# For more information, read [Moro et al., 2011].
#
# Input variables:
# # bank client data:
# 1 - age (numeric)
# 2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid",
# "entrepreneur","student","blue-collar","self-employed","retired","technician","services")
# 3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
# 4 - education (categorical: "unknown","secondary","primary","tertiary")
# 5 - default: has credit in default? (binary: "yes","no")
# 6 - balance: average yearly balance, in euros (numeric)
# 7 - housing: has housing loan? (binary: "yes","no")
# 8 - loan: has personal loan? (binary: "yes","no")
#
# # related with the last contact of the current campaign:
# 9 - contact: contact communication type (categorical: "unknown","telephone","cellular")
# 10 - day: last contact day of the month (numeric)
# 11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
# 12 - duration: last contact duration, in seconds (numeric)
#
# # other attributes:
# 13 - campaign: number of contacts performed during this campaign and for this client
# (numeric, includes last contact)
# 14 - pdays: number of days that passed by after the client was last contacted from
# a previous campaign (numeric, -1 means client was not previously contacted)
# 15 - previous: number of contacts performed before this campaign and for this client (numeric)
# 16 - poutcome: outcome of the previous marketing campaign
# (categorical: "unknown","other","failure","success")
#
# Output variable (desired target):
# 17 - y - has the client subscribed a term deposit? (binary: "yes","no")
# Summary statistics for the numeric columns only
bank_df.describe()
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 |
| mean | 41.170095 | 1422.657819 | 15.915284 | 263.961292 | 2.793630 | 39.766645 | 0.542579 |
| std | 10.576211 | 3009.638142 | 8.247667 | 259.856633 | 3.109807 | 100.121124 | 1.693562 |
| min | 19.000000 | -3313.000000 | 1.000000 | 4.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 33.000000 | 69.000000 | 9.000000 | 104.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 39.000000 | 444.000000 | 16.000000 | 185.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 49.000000 | 1480.000000 | 21.000000 | 329.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 87.000000 | 71188.000000 | 31.000000 | 3025.000000 | 50.000000 | 871.000000 | 25.000000 |
# Fixed random seed so the split (and all later model fits) are reproducible.
seed = 123

# Separate the target from the features, then one-hot encode every
# categorical feature; drop_first avoids the redundant reference dummy.
y = bank_df['y']
X = pd.get_dummies(bank_df.drop('y', axis=1), drop_first=True)

# 80/20 train/test split, stratified on y to preserve the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Standardize the features: fit the scaler on the TRAINING data only,
# so the test set can later be transformed with the same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# RBF-kernel SVM; tune only the cost parameter C over a coarse grid.
svm = SVC(kernel='rbf', random_state=seed)  # Radial basis function kernel
param_grid = {'C': [0.25, 0.50, 1, 2, 4, 8, 16, 32, 64, 128]}

# 10-fold cross-validated grid search.
# n_jobs=-1 runs the 100 fits (10 candidates x 10 folds) on all cores;
# the results are identical to the serial run, just faster.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Collect the cross-validation scores into a tidy frame for plotting.
cv_results = pd.DataFrame(grid_search.cv_results_)
results_df = cv_results.loc[:, ["param_C", "mean_test_score"]]
results_df
| param_C | mean_test_score | |
|---|---|---|
| 0 | 0.25 | 0.885510 |
| 1 | 0.5 | 0.886891 |
| 2 | 1 | 0.888829 |
| 3 | 2 | 0.892151 |
| 4 | 4 | 0.893527 |
| 5 | 8 | 0.892977 |
| 6 | 16 | 0.889103 |
| 7 | 32 | 0.882746 |
| 8 | 64 | 0.881916 |
| 9 | 128 | 0.875831 |
# Visualize how the cross-validated accuracy varies with the SVM cost C.
axis = results_df.plot(x="param_C", y="mean_test_score", marker='.')
axis.set_xlabel("Cost")
axis.set_ylabel("Accuracy (cross-validation)")
Text(0, 0.5, 'Accuracy (cross-validation)')
# Note: the "sigma" parameter in other SVM libraries corresponds to gamma in SVC.
# *This cell takes some time to run*
svm = SVC(kernel='rbf', random_state=seed)  # Radial basis function kernel
# Jointly tune the cost C and the RBF width gamma on a fine grid.
param_grid = {'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'gamma': [0.016, 0.017, 0.018, 0.019, 0.020]}
# 10-fold CV grid search; n_jobs=-1 parallelizes the 500 fits
# (50 candidates x 10 folds) without changing the results.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "param_gamma", "mean_test_score"]]
results_df
| param_C | param_gamma | mean_test_score | |
|---|---|---|---|
| 0 | 1 | 0.016 | 0.889658 |
| 1 | 1 | 0.017 | 0.889659 |
| 2 | 1 | 0.018 | 0.889106 |
| 3 | 1 | 0.019 | 0.889106 |
| 4 | 1 | 0.02 | 0.888276 |
| 5 | 2 | 0.016 | 0.888281 |
| 6 | 2 | 0.017 | 0.888004 |
| 7 | 2 | 0.018 | 0.888559 |
| 8 | 2 | 0.019 | 0.888558 |
| 9 | 2 | 0.02 | 0.888281 |
| 10 | 3 | 0.016 | 0.892431 |
| 11 | 3 | 0.017 | 0.891877 |
| 12 | 3 | 0.018 | 0.892151 |
| 13 | 3 | 0.019 | 0.892981 |
| 14 | 3 | 0.02 | 0.891598 |
| 15 | 4 | 0.016 | 0.892427 |
| 16 | 4 | 0.017 | 0.892981 |
| 17 | 4 | 0.018 | 0.894086 |
| 18 | 4 | 0.019 | 0.893809 |
| 19 | 4 | 0.02 | 0.893809 |
| 20 | 5 | 0.016 | 0.893810 |
| 21 | 5 | 0.017 | 0.894363 |
| 22 | 5 | 0.018 | 0.894360 |
| 23 | 5 | 0.019 | 0.892976 |
| 24 | 5 | 0.02 | 0.891870 |
| 25 | 6 | 0.016 | 0.895742 |
| 26 | 6 | 0.017 | 0.894911 |
| 27 | 6 | 0.018 | 0.894358 |
| 28 | 6 | 0.019 | 0.894081 |
| 29 | 6 | 0.02 | 0.892975 |
| 30 | 7 | 0.016 | 0.896016 |
| 31 | 7 | 0.017 | 0.894357 |
| 32 | 7 | 0.018 | 0.893804 |
| 33 | 7 | 0.019 | 0.892421 |
| 34 | 7 | 0.02 | 0.891867 |
| 35 | 8 | 0.016 | 0.894080 |
| 36 | 8 | 0.017 | 0.892974 |
| 37 | 8 | 0.018 | 0.892697 |
| 38 | 8 | 0.019 | 0.891868 |
| 39 | 8 | 0.02 | 0.891593 |
| 40 | 9 | 0.016 | 0.893527 |
| 41 | 9 | 0.017 | 0.892421 |
| 42 | 9 | 0.018 | 0.891592 |
| 43 | 9 | 0.019 | 0.891870 |
| 44 | 9 | 0.02 | 0.892424 |
| 45 | 10 | 0.016 | 0.893252 |
| 46 | 10 | 0.017 | 0.892421 |
| 47 | 10 | 0.018 | 0.891869 |
| 48 | 10 | 0.019 | 0.892977 |
| 49 | 10 | 0.02 | 0.892147 |
# Interactive line plot: CV accuracy vs C, one line/symbol per gamma value.
import plotly.express as px
fig = px.line(results_df, x = "param_C", y = "mean_test_score", color = "param_gamma", symbol = "param_gamma")
fig
# Best model found by the grid search (highest mean CV accuracy)
grid_search.best_estimator_
SVC(C=7, gamma=0.016, random_state=123)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=7, gamma=0.016, random_state=123)
from sklearn.metrics import confusion_matrix

# Confusion matrix on the TRAINING data — an optimistic estimate, since
# the model's hyperparameters were tuned on these same observations.
y_pred = grid_search.best_estimator_.predict(X_train_scaled)
conf_matrix = confusion_matrix(y_train, y_pred)
conf_matrix
array([[3187, 12],
[ 130, 287]], dtype=int64)
# Class label order used for the confusion-matrix rows/columns ('no', then 'yes')
grid_search.classes_
array(['no', 'yes'], dtype=object)
# Render the training confusion matrix as a labeled heatmap
# instead of a raw array.
from sklearn.metrics import ConfusionMatrixDisplay

class_labels = ["No", "Yes"]
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=class_labels)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x243faf8a820>
# Model validation on the held-out test set.
# BUG FIX: the original called scaler.fit_transform(X_test), which RE-FITS
# the scaler on the test data and scales it with test-set statistics.
# The test set must be transformed with the mean/std learned on the
# training set (scaler.transform), otherwise the features fed to the
# model are inconsistent with how it was trained.
X_test_scaled = scaler.transform(X_test)
y_pred = grid_search.best_estimator_.predict(X_test_scaled)  # Get predictions from the best model
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
array([[777, 24],
[ 69, 35]], dtype=int64)
# Heatmap of the test-set confusion matrix.
ConfusionMatrixDisplay(conf_matrix, display_labels=["No", "Yes"]).plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f2e14d7070>
import numpy as np

# Coarse logarithmic search over C and gamma to locate the promising
# region before refining. *This cell takes time to run*
param_grid = {
    'gamma': [10 ** i for i in range(-5, 0)],  # 1e-5 ... 1e-1
    'C': [10 ** i for i in range(-3, 2)]       # 1e-3 ... 1e1
}
svm = SVC(kernel='rbf', random_state = seed)
# n_jobs=-1 parallelizes the 250 fits (25 candidates x 10 folds);
# results are unchanged.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_model
SVC(C=10, gamma=0.01, random_state=123)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=10, gamma=0.01, random_state=123)
# Show the mean CV accuracy for every (C, gamma) combination tried.
score_columns = ["param_C", "param_gamma", "mean_test_score"]
results_df = pd.DataFrame(grid_search.cv_results_)[score_columns]
results_df
| param_C | param_gamma | mean_test_score | |
|---|---|---|---|
| 0 | 0.001 | 0.00001 | 0.884680 |
| 1 | 0.001 | 0.0001 | 0.884680 |
| 2 | 0.001 | 0.001 | 0.884680 |
| 3 | 0.001 | 0.01 | 0.884680 |
| 4 | 0.001 | 0.1 | 0.884680 |
| 5 | 0.01 | 0.00001 | 0.884680 |
| 6 | 0.01 | 0.0001 | 0.884680 |
| 7 | 0.01 | 0.001 | 0.884680 |
| 8 | 0.01 | 0.01 | 0.884680 |
| 9 | 0.01 | 0.1 | 0.884680 |
| 10 | 0.1 | 0.00001 | 0.884680 |
| 11 | 0.1 | 0.0001 | 0.884680 |
| 12 | 0.1 | 0.001 | 0.884680 |
| 13 | 0.1 | 0.01 | 0.884680 |
| 14 | 0.1 | 0.1 | 0.884680 |
| 15 | 1 | 0.00001 | 0.884680 |
| 16 | 1 | 0.0001 | 0.884680 |
| 17 | 1 | 0.001 | 0.891867 |
| 18 | 1 | 0.01 | 0.890486 |
| 19 | 1 | 0.1 | 0.886065 |
| 20 | 10 | 0.00001 | 0.884680 |
| 21 | 10 | 0.0001 | 0.891867 |
| 22 | 10 | 0.001 | 0.891590 |
| 23 | 10 | 0.01 | 0.897955 |
| 24 | 10 | 0.1 | 0.879977 |
# The coarse search suggests gamma in [0.01, 0.1] and C in [1, 10] do a
# better job, so refine the grid around that region.
# *This cell takes time to run*
param_grid = {
    'gamma': np.arange(0.01, 0.13, 0.01),
    'C': np.arange(1,13,1)
}
svm = SVC(kernel='rbf', random_state = seed)
# n_jobs=-1 parallelizes the 1440 fits (144 candidates x 10 folds);
# results are unchanged.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_model
SVC(C=10, gamma=0.01, random_state=123)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=10, gamma=0.01, random_state=123)
# All results
results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "param_gamma", "mean_test_score"]]
results_df
| param_C | param_gamma | mean_test_score | |
|---|---|---|---|
| 0 | 1 | 0.01 | 0.890486 |
| 1 | 1 | 0.02 | 0.888276 |
| 2 | 1 | 0.03 | 0.889107 |
| 3 | 1 | 0.04 | 0.889107 |
| 4 | 1 | 0.05 | 0.889383 |
| ... | ... | ... | ... |
| 139 | 12 | 0.08 | 0.878872 |
| 140 | 12 | 0.09 | 0.880530 |
| 141 | 12 | 0.1 | 0.879148 |
| 142 | 12 | 0.11 | 0.878872 |
| 143 | 12 | 0.12 | 0.879703 |
144 rows × 3 columns
# Row(s) achieving the highest mean cross-validation accuracy.
top_score = results_df["mean_test_score"].max()
results_df[results_df["mean_test_score"].eq(top_score)]
| param_C | param_gamma | mean_test_score | |
|---|---|---|---|
| 108 | 10 | 0.01 | 0.897955 |
# Validate the refined best model on the held-out test set.
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
array([[782, 19],
[ 67, 37]], dtype=int64)
# Identify the order of the classes ('no' first, then 'yes').
grid_search.classes_
array(['no', 'yes'], dtype=object)
# Plot the final test-set confusion matrix with readable class labels.
ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix, display_labels=["No", "Yes"]
).plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f2e16bbca0>
# Test-set accuracy — it seems to perform even better than the CV estimate.
accuracy = (y_pred == y_test).mean()
accuracy
0.9049723756906077
# 95% confidence interval for the test-set accuracy (proportion of
# correct predictions, default normal approximation).
from statsmodels.stats.proportion import proportion_confint

n_correct = np.sum(y_pred == y_test)
confidence_interval = proportion_confint(count=n_correct, nobs=len(y_pred))
confidence_interval
(0.88586652415365, 0.9240782272275655)